# coding: utf-8
import re

import Levenshtein
from simhash import Simhash


class ShortTextSimilarity:
    """Pairwise similarity metrics for short texts.

    All scores fall in [0, 1]; higher means more similar. Empty-input
    convention: two empty strings score 1.0, exactly one empty string
    scores 0.0 (the originals raised ZeroDivisionError on these cases).
    """

    def __init__(self):
        # BUG FIX: was misspelled `__int__`, so it never ran as the
        # constructor. No state is needed; methods are pure functions
        # of their arguments.
        pass

    # Levenshtein edit-distance similarity
    def _levenshtein(self, str1, str2):
        """Return 1 - edit_distance(str1, str2) / max(len(str1), len(str2))."""
        if not str1 and not str2:
            return 1.0  # identical (empty) strings; also avoids 0/0 below
        # BUG FIX: the original called Levenshtein.distance on the
        # module-level globals ori_str/tar_str instead of the parameters,
        # so it only worked by accident when run as a script.
        edit_num = Levenshtein.distance(str1, str2)
        return 1 - edit_num / max(len(str1), len(str2))

    # Jaccard similarity over character sets
    def _jaccard(self, text1, text2):
        """Return |chars1 & chars2| / |chars1 | chars2| (character-level Jaccard)."""
        char_set_1 = set(text1)
        char_set_2 = set(text2)
        # BUG FIX (naming): the original bound `&` to a variable named
        # "union" and `|` to one named "intersect"; the ratio it computed
        # was already the correct Jaccard index, so behavior is unchanged.
        intersect_char_set = char_set_1 & char_set_2
        union_char_set = char_set_1 | char_set_2
        if not union_char_set:  # both texts empty
            return 1.0
        return len(intersect_char_set) / len(union_char_set)

    # Simhash similarity
    def _sim_hash(self, text1, text2):
        """Return 1 - hamming_distance / hash_width between the two simhashes."""
        a_simhash = Simhash(text1)
        b_simhash = Simhash(text2)
        distance = a_simhash.distance(b_simhash)
        # BUG FIX: normalize by the fixed fingerprint width (Simhash.f,
        # 64 bits by default) rather than len(bin(value)), which includes
        # the '0b' prefix and shrinks when the hash has leading zero bits.
        return 1 - distance / a_simhash.f

    # Longest common subsequence (LCS) similarity
    def _lcs_similarity(self, str_a, str_b):
        """Return LCS_length(str_a, str_b) / min(len(str_a), len(str_b))."""
        if not str_a or not str_b:
            # Guard the min(...) == 0 division of the original.
            return 1.0 if str_a == str_b else 0.0
        # lengths[i][j] = LCS length of str_a[:i] and str_b[:j]
        lengths = [[0] * (len(str_b) + 1) for _ in range(len(str_a) + 1)]
        for i, x in enumerate(str_a):
            for j, y in enumerate(str_b):
                if x == y:
                    lengths[i + 1][j + 1] = lengths[i][j] + 1
                else:
                    lengths[i + 1][j + 1] = max(lengths[i + 1][j], lengths[i][j + 1])
        # The original also reconstructed the subsequence string via
        # backtracking; only its length feeds the ratio, so that dead
        # code is removed.
        longest_len = lengths[len(str_a)][len(str_b)]
        return longest_len / min(len(str_a), len(str_b))

    @staticmethod
    def gen_normalized_text(text):
        """Collapse numbering tokens to the placeholder 'D'.

        Digits/Roman numerals, Chinese numerals, and the enumeration
        markers 第/件 are each replaced so that, e.g., '1.1' and '第一件'
        normalize to comparable forms before similarity scoring.
        """
        re_num = re.compile(r'[0-9IVX]+')
        re_num_Chinese = re.compile(r'[零一二三四五六七八九十]+')
        re_special_char = re.compile(r'[第件]+')

        text = re_num.sub('D', text)
        text = re_num_Chinese.sub('D', text)
        text = re_special_char.sub('D', text)
        return text


if __name__ == "__main__":
    threshold = 0.8
    ori_str = '1.1：早点下班'
    tar_str = '第一件：早点下班'
    short_text_similarity = ShortTextSimilarity()

    ori_str = short_text_similarity.gen_normalized_text(ori_str)
    tar_str = short_text_similarity.gen_normalized_text(tar_str)

    levenshtein_similarity = short_text_similarity._levenshtein(ori_str, tar_str)
    jaccard_similarity = short_text_similarity._jaccard(ori_str, tar_str)
    sim_hash_similarity = short_text_similarity._sim_hash(ori_str, tar_str)
    lcs_similarity_similarity = short_text_similarity._lcs_similarity(ori_str, tar_str)

    print(f"Levenshtein: {levenshtein_similarity}， 是否相似: {levenshtein_similarity > threshold}")
    print(f"Jaccard: {jaccard_similarity}， 是否相似: {jaccard_similarity > threshold}")
    print(f"Simhash: {sim_hash_similarity}， 是否相似: {sim_hash_similarity > threshold}")
    print(f"LCS: {lcs_similarity_similarity}， 是否相似: {lcs_similarity_similarity > threshold}")


